%matplotlib inline
import h5py
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, plot_roc_curve, f1_score, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
skip=['PIG-UPMC-pig55-T0-hgomez-2013-04-15-12-30-10-7'] # ground truth file is blank
%%time
histmaps = []
histmap_dir='/zfsauton/data/public/mcba/MicroScan_videos_2013/mcba_v2/preprocessed/N200_frames/feat_wHistMap_Hfilters'
for filename in glob.glob('{}/*.mat'.format(histmap_dir)):
    # histMap is stored as a MATLAB v7.3 (HDF5) dataset
    with h5py.File(filename, 'r') as f:
        data = np.array(f.get('histMap'))
    # Recover the video id: drop the prefix before the first '_' and the extension
    vid = os.path.basename(filename)
    vid = '-'.join(vid.split('_')[1:]).split('.')[0]
    if vid in skip:
        continue
    # Transpose back from MATLAB's column-major layout
    histmaps.append((vid, data.T))
histmaps.sort(key=lambda x: x[0])
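Quick sanity check (not part of the original pipeline): confirm how many videos were loaded and the shape of one histogram map.
# Illustrative only: count loaded videos and inspect the first histMap's shape
print(len(histmaps))
print(histmaps[0][0], histmaps[0][1].shape)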
def load(ds_dir, skip):
    """Load (video id, array) pairs from all .npy files in ds_dir, skipping ids in skip."""
    ds = []
    for filename in glob.glob('{}/*.npy'.format(ds_dir)):
        data = np.load(filename)
        # The video id is the part of the file name before the first '_'
        vid = os.path.basename(filename).split('_')[0]
        if vid in skip:
            continue
        ds.append((vid, data))
    ds.sort(key=lambda x: x[0])
    return ds
%%time
labels = load(ds_dir='/zfsauton/data/public/mcba/MicroScan_videos_2013/ground_truth',
              skip=skip)
%%time
frames = []
for idx, ((fname1, histmap), (fname2, label)) in enumerate(zip(histmaps, labels)):
    df = pd.DataFrame(histmap.reshape(-1, histmap.shape[-1]),
                      columns=['h' + str(i) for i in range(histmap.shape[-1])])
    df['label'] = label.flatten()
    df['id'] = idx
    frames.append(df)
# Build the table with a single concat; DataFrame.append is deprecated and quadratic in a loop
data = pd.concat(frames, ignore_index=True)
# data.to_feather('all_data.feather')  # run once to cache the assembled table, then reload below
data = pd.read_feather('all_data.feather')
data.label.value_counts()
data.label.value_counts(normalize=True)
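As a quick sketch, the class imbalance is easier to see as a bar chart:
# Illustrative only: visualise the label distribution
data.label.value_counts().sort_index().plot.bar()
plt.xlabel('label')
plt.ylabel('pixel count')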
%%time
data_ava = load(ds_dir='/zfsauton/data/public/mcba/MicroScan_videos_2013/ava_auto/detected_vessels',
                skip=skip)
%%time
mcba_old = load(ds_dir='/zfsauton/data/public/mcba/MicroScan_videos_2013/analysis/overlays',
                skip=skip)
# Fix the truncated name issue: map each truncated overlay name to its full video id
vids = {e[0] for e in histmaps}
mcba_old_renamed = []
for v, data in mcba_old:
    for vid in vids:
        if v in vid:
            mcba_old_renamed.append((vid, data))
            break
mcba_old = mcba_old_renamed
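A hedged consistency check: the zip-based comparisons below assume every truncated overlay name matched exactly one full video id.
# Assumption: each truncated name maps to one full id, so the lengths must agree
assert len(mcba_old) == len(histmaps)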
for (fname1, histmap), (fname2, label) in zip(histmaps, labels):
    assert fname1 == fname2
    fig, (ax1, ax2, ax3, ax4, ax5, ax6) = plt.subplots(1, 6, figsize=(10, 8), dpi=150)
    total = np.sum(histmap, axis=-1)
    ax1.imshow(total > np.percentile(total, 25))
    ax1.set_title('25%')
    ax1.text(0, 0, fname1)
    ax1.set_axis_off()
    ax2.imshow(total > np.mean(total))
    ax2.set_title('Mean')
    ax2.set_axis_off()
    ax3.imshow(total > np.median(total))
    ax3.set_title('Median')
    ax3.set_axis_off()
    ax4.imshow(total > np.percentile(total, 75))
    ax4.set_title('75%')
    ax4.set_axis_off()
    ax5.imshow(np.sum(histmap > 0, axis=-1))
    ax5.set_title('Non-zeros')
    ax5.set_axis_off()
    ax6.imshow(label)
    ax6.set_title('Annotation')
    ax6.set_axis_off()
    # fig.suptitle(fname1)
%%time
test_id = 50
test_idx = data[data.id == test_id].index
train_idx = data[data.id != test_id].index
X_train, y_train = data.iloc[train_idx, 0:100], (data.label > -2)[train_idx]
X_test, y_test = data.iloc[test_idx, 0:100], (data.label > -2)[test_idx]
y_train.value_counts(normalize=True)
Baseline: use the non-zero bins at each pixel (predict vessel wherever any histogram bin is non-zero).
bin_data = data.filter(like='h').copy()
bin_data[bin_data > 0] = 1
bin_data = pd.concat([bin_data, data[['label', 'id']]], axis=1)
y_pred = bin_data.filter(like='h').any(axis=1)
from sklearn.metrics import accuracy_score
# Binarise the labels the same way as the classifiers below (> -2 = vessel)
accuracy_score(data.label > -2, y_pred)
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay(confusion_matrix(data.label > -2, y_pred)).plot()
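Accuracy alone is misleading under this heavy class imbalance; F1 on the binarised labels is a useful complement (a quick sketch):
f1_score(data.label > -2, y_pred)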
%%time
pipe = Pipeline([
    ('pca', PCA(n_components=0.95)),
    ('clf', RandomForestClassifier(max_samples=5000, random_state=0, n_jobs=-1))
], verbose=True)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
plot_confusion_matrix(pipe['clf'], pipe['pca'].transform(X_test), y_test, normalize='true')
downsampled = resample(data[(data.id != test_id) & (data.label == -2)],
                       replace=False,
                       n_samples=len(data[(data.id != test_id) & (data.label > -2)]),
                       random_state=27)
train = pd.concat([downsampled, data[(data.id != test_id) & (data.label > -2)]])
X_train, y_train = train.iloc[:, 0:100], (train.label > -2)
y_train.value_counts(normalize=True)
%%time
pipe = Pipeline([
    ('pca', PCA(n_components=0.95)),
    ('clf', RandomForestClassifier(max_samples=5000, random_state=0, n_jobs=-1))
], verbose=True)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
plot_confusion_matrix(pipe['clf'], pipe['pca'].transform(X_test), y_test, normalize='true')
plot_roc_curve(pipe['clf'], pipe['pca'].transform(X_test), y_test)
%%time
pipe = Pipeline([
    ('pca', PCA(n_components=0.95)),
    ('clf', LogisticRegression(random_state=0, n_jobs=-1))
], verbose=True)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
plot_confusion_matrix(pipe['clf'], pipe['pca'].transform(X_test), y_test, normalize='true')
plot_roc_curve(pipe['clf'], pipe['pca'].transform(X_test), y_test)
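For the rebalanced model, F1 on the held-out video complements the accuracy score above (a quick sketch using the fitted pipeline):
f1_score(y_test, pipe.predict(X_test))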
from sklearn.metrics import accuracy_score
def metrics(labels, data_ava, mcba_old, histmaps):
    rows = []
    for (name, label), (_, ava), (_, old), (_, new) in zip(labels, data_ava, mcba_old, histmaps):
        label, ava, old = np.copy(label).flatten(), np.copy(ava).flatten(), np.copy(old).flatten()
        # Binarise: anything above the background code (-2) counts as vessel
        label[label > -2] = 1
        label[label <= -2] = 0
        ava[ava > -2] = 1
        ava[ava <= -2] = 0
        old[old > 0] = 1
        # Threshold the new histogram maps at the mean per-pixel non-zero count
        h = np.sum(np.copy(new) > 0, axis=-1)
        threshold = h.mean()
        h[h <= threshold] = 0
        h[h > threshold] = 1
        h = h.flatten()
        assert label.shape == h.shape
        def add_model(model, yhat):
            rows.append(dict(vid=name, model=model, metric='accuracy', score=accuracy_score(label, yhat)))
            rows.append(dict(vid=name, model=model, metric='f1', score=f1_score(label, yhat)))
        add_model('ava', ava)
        add_model('old', old)
        add_model('new', h)
    return pd.DataFrame(rows, columns=['vid', 'model', 'metric', 'score'])
%%time
df = metrics(labels, data_ava, mcba_old, histmaps)
pd.pivot_table(df, values='score', index=['vid', 'model'], columns='metric')
pd.pivot_table(df, index=['model'], values='score', columns='metric')
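The per-model averages can also be drawn as a grouped bar chart (an illustrative sketch):
pd.pivot_table(df, index=['model'], values='score', columns='metric').plot.bar()
plt.ylabel('score')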
assert len(histmaps) == len(labels)
for (fname1, histmap), (fname2, label), (fname3, ava), (fname4, old) in zip(histmaps, labels, data_ava, mcba_old):
    assert fname1 == fname2
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(10, 8), dpi=150)
    ax1.imshow(ava)
    ax1.set_title('AVA')
    ax1.text(0, 0, fname1)
    ax1.set_axis_off()
    ax2.imshow(old)
    ax2.set_title('MCBA old')
    ax2.set_axis_off()
    ax3.imshow(np.sum(histmap > 0, axis=-1))
    ax3.set_title('MCBA new')
    ax3.set_axis_off()
    ax4.imshow(label)
    ax4.set_title('Annotation')
    ax4.set_axis_off()
    # fig.suptitle(fname1)
%%time
test_id = 50
test_idx = data[(data.id == test_id) & (data.label >= 0)].index
train_idx = data[(data.id != test_id) & (data.label >= 0)].index
X_train, y_train = data.iloc[train_idx, 0:100], data.iloc[train_idx, 100]
X_test, y_test = data.iloc[test_idx, 0:100], data.iloc[test_idx, 100]
X_train.shape
X_test.shape
y_train.value_counts(normalize=True)
from sklearn.ensemble import RandomForestClassifier
%%time
clf = RandomForestClassifier(max_samples=5000, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, plot_roc_curve, f1_score
plot_confusion_matrix(clf, X_test, y_test)
f1_score(y_test, y_pred, average='micro')
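Micro-averaged F1 equals plain accuracy in a single-label multiclass setting; macro-averaging weights each class equally and so exposes the minority classes (a quick sketch):
f1_score(y_test, y_pred, average='macro')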
%%time
pipe = Pipeline([
    ('pca', PCA(n_components=0.95)),
    ('clf', RandomForestClassifier(max_samples=5000, random_state=0, n_jobs=-1))
], verbose=True)
pipe.fit(X_train, y_train)
print('Number of components: {}'.format(pipe['pca'].n_components_))
pipe.score(X_test, y_test)
from sklearn.svm import SVC
pipe = Pipeline([
    ('pca', PCA(n_components=0.95)),
    ('clf', SVC(random_state=0))
], verbose=True)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
    ('pca', PCA(n_components=0.95)),
    ('clf', LogisticRegression(random_state=0, n_jobs=-1))
], verbose=True)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
from sklearn.naive_bayes import GaussianNB
pipe = Pipeline([
    ('pca', PCA(n_components=0.95)),
    ('clf', GaussianNB())
], verbose=True)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
plot_confusion_matrix(pipe['clf'], pipe['pca'].transform(X_test), y_test)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
n_classes = len(y_test.unique())
%%time
clf = OneVsRestClassifier(RandomForestClassifier(max_samples=5000, random_state=0, n_jobs=-1), n_jobs=-1)
clf.fit(X_train, y_train)
ax = plt.subplot()
for estimator, label in zip(clf.estimators_, clf.classes_):
    # Each one-vs-rest estimator is binary, so binarise y_test against its class
    plot_roc_curve(estimator, X_test, (y_test == label).astype(int), name=label, ax=ax)
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.legend()
from sklearn.utils import resample
n_samples = data[(data.id != test_id) & (data.label == 0)].shape[0]
def upsample(label):
    to_upsample_idx = data[(data.id != test_id) & (data.label == label)].index
    return resample(data.iloc[to_upsample_idx, :], replace=True, n_samples=n_samples, random_state=27)
upsampled_4 = upsample(4)
upsampled_1 = upsample(1)
to_downsample_idx = data[(data.id != test_id) & (data.label == 3)].index
downsampled = resample(data.iloc[to_downsample_idx, :], replace=False, n_samples=n_samples, random_state=27)
bal_idx = data[(data.id != test_id) & (data.label.isin([0, 2]))].index
train_bal = pd.concat([data.iloc[bal_idx, :], upsampled_1, upsampled_4, downsampled])
train_bal = train_bal.reset_index(drop=True)
# Generate training data from the balanced data
X_train, y_train = train_bal.iloc[:, 0:100], train_bal.iloc[:, 100]
y_train.value_counts(normalize=True)
%%time
clf = OneVsRestClassifier(RandomForestClassifier(max_samples=5000, random_state=0, n_jobs=-1), n_jobs=-1)
clf.fit(X_train, y_train)
ax = plt.subplot()
for estimator, label in zip(clf.estimators_, clf.classes_):
    # Binarise y_test per class for each binary one-vs-rest estimator
    plot_roc_curve(estimator, X_test, (y_test == label).astype(int), name=label, ax=ax)
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.legend()